Qiime2 Pipeline

Step 1: Import accession IDs

Import list of IDs into a NCBI Accession IDs artifact

qiime tools import \
              --type NCBIAccessionIDs \
              --input-path Protein.tsv \
              --output-path Protein.qza

Step 2: Fetching metadata and sequences

Fetch both sequence-associated metadata and sequences associated with the provided IDs

qiime fondue get-all \
              --i-accession-ids Protein.qza \
              --output-dir Meta_Analysis

Step 3: Create a manifest file

#You will also need a metadata file that contains all the other data related to the sample from which the sequences are obtained

Manifest files: Protein_manifest
Metadata file: FINAL_Metadata.tsv

Step 4: Import manifest file

qiime tools import \
  --type 'SampleData[SequencesWithQuality]' \
  --input-path Protein_manifest \
  --output-path Protein_demux_NEW.qza \
  --input-format SingleEndFastqManifestPhred33

Step 5: Demultiplexing sequences

(Single reads)
qiime demux summarize \
      --i-data Protein_demux_NEW.qza \
      --o-visualization Protein_demux_NEW.qzv

qiime tools view Protein_demux_NEW.qzv

Step 6: Dada2 Sequence quality control and feature table construction.

qiime dada2 denoise-single \
  --i-demultiplexed-seqs Protein_demux_NEW.qza \
  --p-trim-left 0 \
  --p-trunc-len 150 \
  --o-representative-sequences Protein_rep-seqs.qza \
  --o-table Protein_table.qza \
  --o-denoising-stats Protein_stats.qza

Step 7: FeatureTable and FeatureData summaries

qiime feature-table summarize \
  --i-table Protein_table.qza \
  --o-visualization Protein_table.qzv \
  --m-sample-metadata-file FINAL_Metadata.tsv
qiime feature-table tabulate-seqs \
  --i-data Protein_rep-seqs.qza \
  --o-visualization Protein_rep-seqs.qzv

Step 8: Generate a tree for phylogenetic diversity analyses

qiime phylogeny align-to-tree-mafft-fasttree \
  --i-sequences PROTEIN_NEW_rep-seqs.qza \
  --o-alignment Protein_NEW_aligned-rep-seqs.qza \
  --o-masked-alignment Protein_NEW_masked-aligned-rep-seqs.qza \
  --o-tree Protein_NEW_unrooted-tree.qza \
  --o-rooted-tree Protein_NEW_rooted-tree.qza

Step 9: Taxonomic analysis

#This is necessary to begin to explore the taxonomic composition of the samples, and again relate that to sample metadata #Train classifier or download Silva or Greengene trained classifier

qiime feature-classifier classify-sklearn \
  --i-classifier silva-138.1-ssu-nr99-classifier.qza \
  --i-reads Protein_rep-seqs.qza \
  --o-classification Protein_taxonomy.qza

END OF QIIME2

Launch R Studio

Install the required Packages

R Studio

knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  echo = TRUE
)

Step 10. Load required Libraries

library(conflicted)
# Set conflict preferences
suppressMessages({
 conflicts_prefer(microbiome::diversity)
  conflicts_prefer(qiime2R::read_qza)
  conflicts_prefer(microbiomeMarker::tax_table)
  conflicts_prefer(microbiomeMarker::nsamples)
  conflicts_prefer(stats::dist)
  conflicts_prefer(microbiomeMarker::ntaxa)
  conflicts_prefer(phyloseq::plot_heatmap)
  conflicts_prefer(base::as.matrix)
  conflicts_prefer(phyloseq::ntaxa)
  conflicts_prefer(base::setdiff)
  conflicts_prefer(plotly::filter)
  conflicts_prefer(ggplot2::margin)
  conflicts_prefer(base::rownames)
  conflicts_prefer(base::colnames)
  conflicts_prefer(ggplot2::annotate)
  conflicts_prefer(microbiome::transform)
})

# List of required libraries
libraries <- c("NBZIMM", "broom", "caret", "cluster", "codyn",
               "cowplot", "devtools", "dplyr",
               "edgeR", "fpc", "ggplot2", "ggplotify", "ggpmisc",
               "ggpubr", "ggRandomForests", "ggrepel", "ggsci",
               "grid", "gridExtra", "igraph", "LDAvis",
               "limma", "magrittr", "MASS", "metamicrobiomeR",
               "microbiome", "patchwork", "phyloseq", "proxy", "randomForest",
               "randomForestSRC", "RColorBrewer","scales", "slam",
               "splitstackshape", "stm", "tensorflow", "tibble",
               "tidyverse", "topicmodels", "usethis", "vegan", "lme4",
               "rbiom", "qiime2R", "ggVennDiagram", "ggvenn",
               "lattice", "permute", "SpiecEasi", "conflicted",
               "biomformat", "mixOmics", "kableExtra", "nlme", 
               "glmmTMB", "gghighlight", "R.utils",
               "microbiomeutilities", "microbiomeMarker", "microViz", "ggpubr", "plotly",
               "pander", "pROC", "ggdendro", "grid", "ggpubr", "ggpattern")

# Load libraries silently
invisible(lapply(libraries, function(pkg) {
  suppressWarnings(suppressPackageStartupMessages(library(pkg, character.only = TRUE)))
}))

Step 11: Create the phyloseq object from Qiime2 output

Protein <- qza_to_phyloseq(
  features = "TWO_table.qza",        # Feature table file
  tree = "Protein_rooted-tree.qza", # Phylogenetic tree file
  taxonomy = "PROTEIN_NEW_taxonomy.qza", # Taxonomy file
  metadata = "FINAL_Metadata.tsv"   # Metadata file
)

# Filter taxa with counts greater than 1 across all samples
Protein_filt <- prune_taxa(taxa_sums(Protein) > 1, Protein)

# Output a summary of the filtered phyloseq object
Protein_filt
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 10123 taxa and 187 samples ]
## sample_data() Sample Data:       [ 187 samples by 26 sample variables ]
## tax_table()   Taxonomy Table:    [ 10123 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 10123 tips and 10061 internal nodes ]

Step 12: Venn Diagram of Core Microbiota at Genus Level

Protein_filt@tax_table %>% 
  data.frame() %>% 
  group_by(Phylum) %>% 
  summarise(n = length(Genus)) 
## # A tibble: 36 × 2
##    Phylum               n
##    <chr>            <int>
##  1 Acidobacteriota    183
##  2 Actinobacteriota   757
##  3 Armatimonadota      14
##  4 Bacteroidota      2982
##  5 Bdellovibrionota    21
##  6 Caldisericota        3
##  7 Calditrichota        2
##  8 Campylobacterota    44
##  9 Chloroflexi        236
## 10 Cloacimonadota       2
## # ℹ 26 more rows
bac_Protein_filt <- subset_taxa(Protein_filt, Kingdom == "d__Bacteria")

bac_Protein_df <- psmelt(bac_Protein_filt)
#Now we can more easily summarise our metagenomes by sample using standard synta##
number_of_taxa <- bac_Protein_df %>% 
  dplyr::filter(Abundance > 5) %>% 
  group_by(Protein_source == "Plant", Genus) %>% #Protein_type can be changed to any group
  summarise(n = length(Abundance))

number_of_taxa2 <- bac_Protein_df %>% 
  dplyr::filter(Abundance > 5) %>% 
  group_by(Protein_source == "Animal", Genus) %>% #Protein_type can be changed to any group
  summarise(n = length(Abundance))

#To place the sets of genera in a list, we use 
venn_data <- list(Plant = c(number_of_taxa$Genus[number_of_taxa$`Protein_source == "Plant"`]),
                  Animal = c(number_of_taxa2$Genus[number_of_taxa2$`Protein_source == "Animal"`]))

# Plot the Venn diagram and specify colors for each set
ggvenn(
  venn_data,
  fill_color = c("blue", "red"),
  stroke_size = 0.5, set_name_size = 4
)

Step 13: Unique Microbiota in Plant Proteins at Genus Level

# Step 1: Extract unique taxa
plant_taxa <- number_of_taxa$Genus[number_of_taxa$`Protein_source == "Plant"`]
animal_taxa <- number_of_taxa2$Genus[number_of_taxa2$`Protein_source == "Animal"`]

unique_plant_taxa <- setdiff(plant_taxa, animal_taxa)
unique_animal_taxa <- setdiff(animal_taxa, plant_taxa)

# Preview unique taxa
#head(unique_plant_taxa)
#head(unique_animal_taxa)

# Step 2: Extract abundance data for unique taxa
unique_plant_data <- number_of_taxa[number_of_taxa$Genus %in% unique_plant_taxa, ]
unique_animal_data <- number_of_taxa2[number_of_taxa2$Genus %in% unique_animal_taxa, ]

# Optional: Preview the data frames
#head(unique_plant_data)
#head(unique_animal_data)
# Step 3: Summarize abundance for unique plant taxa
unique_plant_abundance <- unique_plant_data %>%
  group_by(Genus) %>%
  summarize(TotalAbundance = sum(n, na.rm = TRUE)) %>%
  arrange(desc(TotalAbundance))

# Step 4: Identify the top 20 unique plant taxa
top_20_unique_taxa <- unique_plant_abundance$Genus[1:min(20, nrow(unique_plant_abundance))]

# Step 5: Group remaining taxa as "Others"
pie_data_combined <- unique_plant_data %>%
  mutate(Taxa_Genus = ifelse(Genus %in% top_20_unique_taxa, Genus, "Others")) %>%
  group_by(Taxa_Genus) %>%
  summarize(TotalAbundance = sum(n, na.rm = TRUE)) %>%
  mutate(Percentage = TotalAbundance / sum(TotalAbundance) * 100) %>%
  arrange(desc(TotalAbundance))
#pie_data_combined
# Step 6: Create a distinct color palette
if (nrow(pie_data_combined) > 0) {
  distinct_colors <- brewer.pal(n = 11, name = "Paired")  # Primary palette
  distinct_colors <- c(distinct_colors, brewer.pal(n = 7, name = "Dark2"), brewer.pal(n = 3, name = "Set3"))  # Additional colors
  distinct_colors <- distinct_colors[1:nrow(pie_data_combined)]  # Ensure enough colors for the data
} else {
  stop("Error: `pie_data_combined` is empty. Check your data processing steps.")
}

# Step 7: Add percentages to legend labels
if ("Taxa_Genus" %in% colnames(pie_data_combined) && "Percentage" %in% colnames(pie_data_combined)) {
  pie_data_combined <- pie_data_combined %>%
    mutate(Taxa_Genus = paste0(Taxa_Genus, " (", round(Percentage, 1), "%)"))
} else {
  stop("Error: `pie_data_combined` does not have the required columns. Ensure `Taxa_Genus` and `Percentage` exist.")
}

# Step 8: Plot the pie chart
if (nrow(pie_data_combined) > 0) {
  pie_chart_combined <- ggplot(pie_data_combined, aes(x = "", y = TotalAbundance, fill = Taxa_Genus)) +
    geom_bar(stat = "identity", width = 1) +
    coord_polar(theta = "y") +
    labs(
      title = "Unique Plant Taxa",
      fill = "Genus"
    ) +
    theme_void() +
    theme(
      legend.position = "bottom",
      legend.key.size = unit(0.3, "cm"),
      legend.text = element_text(size = 7),
      plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
      plot.margin = margin(t = 10, r = 10, b = 30, l = 10)
    ) +
    scale_fill_manual(values = distinct_colors)

  # Step 9: Display the pie chart
  print(pie_chart_combined)
}

Step 14: Unique Microbiota in Animal Proteins at Genus Level

# Step 1: Filter unique animal taxa
unique_animal_group <- unique_animal_data %>%
  filter(Genus %in% unique_animal_taxa)

# Step 2: Summarize abundance for unique animal taxa
unique_animal_abundance <- unique_animal_group %>%
  group_by(Genus) %>%
  summarize(TotalAbundance = sum(n, na.rm = TRUE)) %>%
  arrange(desc(TotalAbundance))

# Step 3: Identify the top 20 unique animal taxa
top_20_unique_animal_taxa <- unique_animal_abundance$Genus[1:min(20, nrow(unique_animal_abundance))]

# Step 4: Group remaining taxa as "Others"
pie_data_combined <- unique_animal_group %>%
  mutate(Taxa_Genus = ifelse(Genus %in% top_20_unique_animal_taxa, Genus, "Others")) %>%
  group_by(Taxa_Genus) %>%
  summarize(TotalAbundance = sum(n, na.rm = TRUE)) %>%
  mutate(Percentage = TotalAbundance / sum(TotalAbundance) * 100) %>%
  arrange(desc(TotalAbundance))

# Step 5: Add percentages to legend labels
pie_data_combined <- pie_data_combined %>%
  mutate(
    Taxa_Genus = paste0(Taxa_Genus, " (", round(Percentage, 1), "%)")  # Add percentage in brackets
  )

# Step 6: Use the same distinct color palette as before
distinct_colors <- brewer.pal(n = 11, name = "Paired")  # Primary palette
distinct_colors <- c(distinct_colors, brewer.pal(n = 7, name = "Dark2"), brewer.pal(n = 3, name = "Set3"))  # Additional colors
distinct_colors <- distinct_colors[1:nrow(pie_data_combined)]  # Ensure enough colors for the data

# Step 7: Plot the pie chart
pie_chart_combined <- ggplot(pie_data_combined, aes(x = "", y = TotalAbundance, fill = Taxa_Genus)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar(theta = "y") +
  labs(
    title = "Unique Animal Taxa",
    fill = "Genus"
  ) +
  theme_void() +
  theme(
    legend.position = "bottom",
    legend.key.size = unit(0.3, "cm"),  # Adjust legend size
    legend.text = element_text(size = 7),  # Reduce legend text size
    plot.title = element_text(face = "bold", hjust = 0.5, size = 14),  # Center and enlarge title
    plot.margin = margin(t = 10, r = 10, b = 30, l = 10)  # Add bottom margin
  ) +
  scale_fill_manual(values = distinct_colors)  # Apply distinct colors

# Step 8: Display the pie chart
print(pie_chart_combined)

Step 15: Bar Plots Core Microbiota Composition Genus

# Step 1: Filter and clean the phyloseq object
Protein_filt <- Protein_filt %>%
  tax_fix(unknowns = c("uncultured", "unclassified", "unknown", "NA")) %>%  # Replace unknown values
  tax_filter(min_prevalence = 1) %>%  # Retain taxa with at least 1 prevalence
  phyloseq_validate()  # Validate the phyloseq object
#print(Protein_filt)

# Step 2: Check and handle NAs in the Genus rank
if (any(is.na(tax_table(Protein_filt)[, "Genus"]))) {
  cat("NAs found in Genus rank. Replacing with 'Unknown'.\n")
  tax_table(Protein_filt)[, "Genus"] <- ifelse(
    is.na(tax_table(Protein_filt)[, "Genus"]),
    "Unknown",
    tax_table(Protein_filt)[, "Genus"]
  )
}

# Step 3: Create the bar plot
p1 <- Protein_filt %>%
  ps_select(Protein_source, Protein_level) %>%  # Select relevant metadata columns
  phyloseq::merge_samples(group = "Protein_source") %>%  # Merge samples by "Protein_source"
  comp_barplot(
    tax_level = "Genus",  # Aggregation level for plotting
    n_taxa = 19,  # Number of taxa to display
    bar_width = 0.8,  # Adjust bar width
    position = position_dodge(width = 0.6),  # Adjust spacing between bars
    transform = function(x) x * 100  # Convert relative abundance to percentages
  ) +
  labs(
    x = "Protein Source",  # X-axis label
    y = "Relative Abundance (%)"  # Y-axis label indicating percentage
  ) +
  theme_classic() +  # Apply a clean theme
  theme(
    axis.text.x = element_text(face = "bold", size = 12),  # Customize x-axis text
    axis.text.y = element_text(face = "bold", size = 12),  # Customize y-axis text
    axis.title.x = element_text(face = "bold", size = 14),  # Bold x-axis title
    axis.title.y = element_text(face = "bold", size = 14),  # Bold y-axis title
    legend.text = element_text(size = 10),  # Adjust legend text size
    legend.title = element_text(face = "bold")  # Bold legend title
  )

# Step 4: Ensure proper ordering of Protein_source levels
if ("Protein_source" %in% colnames(p1$data)) {
  p1$data$Protein_source <- factor(p1$data$Protein_source, levels = c("Animal", "Plant"))
}

# Step 5: Display the plot
print(p1)

Evaluation of Microbial Diversity

Alpha Diversity

Step 16: Evaluation of Sequencing Depth for Alpha diversity

# Required Libraries
library(phyloseq)
library(ggplot2)
library(dplyr)

# Step 1: Extract Sequencing Depth
sample_depth <- data.frame(
  SampleID = sample_names(Protein_filt),             # Sample names
  SequencingDepth = sample_sums(Protein_filt)        # Total reads per sample
)

# Step 2: Plot Sequencing Depth
sequencing_depth_plot <- ggplot(sample_depth, aes(x = reorder(SampleID, -SequencingDepth), y = SequencingDepth)) +
  geom_bar(stat = "identity", fill = "steelblue") +  # Bar plot
  theme_classic() +                                  # Clean theme
  labs(
    x = "Samples",
    y = "Sequencing Depth (Total Reads)",
    title = "Sequencing Depth Across Samples"
  ) +
  theme(
    axis.text.x = element_blank(),             # Remove x-axis text
    axis.ticks.x = element_blank(),            # Remove x-axis ticks
    axis.text.y = element_text(face = "bold"), # Bold y-axis text
    axis.title = element_text(face = "bold", size = 12), # Bold axis titles
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5) # Centered title
  ) +
  geom_hline(
    yintercept = median(sample_depth$SequencingDepth), 
    linetype = "dashed", color = "red", size = 1
  ) + # Median reference line
  annotate(
    "text", 
    x = 1, 
    y = median(sample_depth$SequencingDepth) + 500, 
    label = paste("Median =", median(sample_depth$SequencingDepth)), 
    color = "red", size = 4, hjust = 0
  )

# Display Sequencing Depth Plot
print(sequencing_depth_plot)

# Step 3: Calculate Alpha Diversity
alpha_diversity <- estimate_richness(Protein_filt, measures = c("Shannon", "Observed")) %>%
  mutate(
    SequencingDepth = sample_sums(Protein_filt),  # Add sequencing depth
    SampleID = rownames(.)               # Add sample IDs
  )

# Step 4: Correlation Between Sequencing Depth and Shannon Index
cor_result <- cor.test(alpha_diversity$SequencingDepth, alpha_diversity$Shannon, method = "spearman")
cor_rho <- round(cor_result$estimate, 3)   # Spearman rho
cor_pval <- signif(cor_result$p.value, 3)  # p-value

# Step 5: Plot Sequencing Depth vs Alpha Diversity
shannon_plot <- ggplot(alpha_diversity, aes(x = SequencingDepth, y = Shannon)) +
  geom_point(size = 3, color = "steelblue") + # Scatter plot
  geom_smooth(method = "lm", color = "red", linetype = "dashed") + # Add linear trend line
  labs(
    title = "Relationship Between Sequencing Depth and Microbial Diversity",
    x = "Sequencing Depth (Total Reads)",
    y = "Shannon Diversity Index"
  ) +
  theme_classic() + # Clean theme
  theme(
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5),
    axis.title = element_text(face = "bold", size = 12),
    axis.text = element_text(size = 10)
  ) +
  annotate(
    "text", 
    x = max(alpha_diversity$SequencingDepth) * 0.7, 
    y = max(alpha_diversity$Shannon) * 0.95,
    label = paste0("Spearman's rho = ", cor_rho, "\nP-value = ", cor_pval),
    color = "red", size = 5, hjust = 0
  )

# Display Shannon Plot
print(shannon_plot)

Step 17: Generalized Linear Mixed Effects Model

# Step 1: Calculate Alpha Diversity (e.g., Shannon Index)
alpha_div <- estimate_richness(Protein_filt, measures = "Shannon")

# Step 2: Combine Alpha Diversity with Metadata
metadata <- data.frame(sample_data(Protein_filt))
alpha_div_df <- cbind(alpha_div, metadata)

# Step 3: Convert Categorical Variables to Numeric
numeric_vars <- c("Protein_source", "Country", "Isolation_source", "Protein_level", "Age_in_weeks")
alpha_div_df[numeric_vars] <- lapply(alpha_div_df[numeric_vars], function(x) as.numeric(factor(x)))

# Step 4: Compute Correlation Matrix for Predictors
cor_matrix <- cor(alpha_div_df[, numeric_vars], use = "complete.obs")
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
# Step 5: Fit Linear Mixed-Effects Models
# Model 1: Protein Source as fixed effect, Country as random effect
model1 <- lmer(Shannon ~ Protein_source + (1 | Isolation_source), data = alpha_div_df)

# Model 2: Add Protein Level and Isolation Source as fixed effects
model2 <- lmer(Shannon ~ Protein_source + Protein_level + Country + (1 | Isolation_source), data = alpha_div_df)

# Model 3: Add Age in Weeks as a fixed effect
model3 <- lmer(Shannon ~ Protein_source + Protein_level + Age_in_weeks + Country + (1 | Isolation_source), data = alpha_div_df)

# Model 4: Protein Source and Protein Level as fixed effects
model4 <- lmer(Shannon ~ Protein_source + Protein_level + (1 | Isolation_source), data = alpha_div_df)

# Model 5: Add Age in Weeks and Country as fixed effects
model5 <- lmer(Shannon ~ Protein_source + Protein_level + Age_in_weeks + Country + (1 | Isolation_source), data = alpha_div_df)

# Model 6: Protein Source, Protein Level, and Age in Weeks as fixed effects
model6 <- lmer(Shannon ~ Protein_level + Protein_source + Age_in_weeks + Country + (1 | Isolation_source), data = alpha_div_df)
# Step 6: Summarize Models
cat("Model Summaries:\n")
## Model Summaries:
# Step 7: Model Comparison
cat("Likelihood Ratio Test for All Models:\n")
## Likelihood Ratio Test for All Models:
print(anova(model1, model2, model3, model4, model5, model6))
## Data: alpha_div_df
## Models:
## model1: Shannon ~ Protein_source + (1 | Isolation_source)
## model4: Shannon ~ Protein_source + Protein_level + (1 | Isolation_source)
## model2: Shannon ~ Protein_source + Protein_level + Country + (1 | Isolation_source)
## model3: Shannon ~ Protein_source + Protein_level + Age_in_weeks + Country + (1 | Isolation_source)
## model5: Shannon ~ Protein_source + Protein_level + Age_in_weeks + Country + (1 | Isolation_source)
## model6: Shannon ~ Protein_level + Protein_source + Age_in_weeks + Country + (1 | Isolation_source)
##        npar    AIC    BIC  logLik deviance   Chisq Df Pr(>Chisq)    
## model1    4 484.57 497.49 -238.28   476.57                          
## model4    5 485.61 501.77 -237.81   475.61  0.9585  1     0.3276    
## model2    6 469.24 488.63 -228.62   457.24 18.3676  1  1.821e-05 ***
## model3    7 470.82 493.43 -228.41   456.82  0.4266  1     0.5137    
## model5    7 470.82 493.43 -228.41   456.82  0.0000  0               
## model6    7 470.82 493.43 -228.41   456.82  0.0000  0               
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 18: ROC Curve Plot of Alpha Diversity Metrics

alpha_div <- data.frame(
  Observed = rowSums(otu_table_data > 0),  # Count of observed taxa
  Shannon = diversity(otu_table_data, index = "shannon"),  # Shannon index
  Simpson = diversity(otu_table_data, index = "simpson"),  # Simpson index
  ACE = apply(otu_table_data, 1, function(x) sum(x > 0) / mean(x[x > 0])),  # ACE approximation
  Chao1 = apply(otu_table_data, 1, function(x) {  # Chao1 index
    S_obs <- sum(x > 0)
    singletons <- sum(x == 1)
    doubletons <- sum(x == 2)
    if (doubletons > 0) S_obs + (singletons^2) / (2 * doubletons) else NA
  })
)

# Step 4: Combine Alpha Diversity Metrics with Metadata
metadata <- data.frame(sample_data(Protein_filt))
if (!"Protein_source" %in% colnames(metadata)) stop("Protein_source column missing in metadata.")

# Match alpha diversity metrics to metadata samples
alpha_div <- alpha_div[rownames(metadata), , drop = FALSE]
alpha_div <- cbind(metadata, alpha_div)

# Step 5: Prepare for ROC Curve Analysis
if (length(unique(alpha_div$Protein_source)) != 2) {
  stop("Protein_source must have exactly two groups for ROC analysis.")
}

# Define metrics and colors
metrics <- c("Observed", "Shannon", "Simpson", "ACE", "Chao1")
colors <- c("red", "blue", "green", "purple", "orange")
legend_labels <- c()

# Step 6: Plot Multi-Curve ROC with AUC and P-Values
plot(NULL, xlim = c(0, 1), ylim = c(0, 1),
     xlab = "False Positive Rate", ylab = "True Positive Rate",
     main = "ROC Curves for Alpha Diversity Metrics (Protein Source)", type = "n")
lines(c(0, 1), c(1, 0), col = "gray", lty = 2)  # Diagonal reference line

for (i in seq_along(metrics)) {
  metric <- metrics[i]
  if (any(is.na(alpha_div[[metric]]) | is.infinite(alpha_div[[metric]]))) {
    warning(paste("Skipping", metric, "due to invalid values"))
    next
  }

  # ROC and AUC calculation
  roc_result <- roc(alpha_div$Protein_source, alpha_div[[metric]])
  auc_value <- auc(roc_result)

  # Wilcoxon test for significance
  p_value <- wilcox.test(alpha_div[[metric]] ~ alpha_div$Protein_source)$p.value

  # Add curve and update legend
  lines(roc_result, col = colors[i], lwd = 2)
  legend_labels <- c(legend_labels, paste(metric, "(AUC =", round(auc_value, 3), ", p =", signif(p_value, 3), ")"))
}

# Add legend to the plot
legend("bottomleft", legend = legend_labels, col = colors, lwd = 2, title = "Alpha Diversity Metrics")

cat("ROC curve successfully plotted.\n")

Step 19: Alpha Diversity Metrics Plot

# Check sample data and group counts
table(sample_data(Protein_filt)$Protein_source)  # Display counts for each group in Protein_source
## 
## Animal  Plant 
##    155     32
# Plot alpha diversity metrics
alpha_diversity_plot <- Protein_filt %>%
  plot_richness(
    x = "Protein_source",  # Group by Protein_source
    measures = c("Shannon", "Chao1", "InvSimpson")  # Select alpha diversity measures
  ) +
  geom_boxplot(
    aes(fill = Protein_source), 
    outlier.shape = 16,          # Shape of outlier points
    outlier.size = 0.1,          # Size of outlier points
    outlier.alpha = 0.9,         # Transparency of outliers
    show.legend = FALSE          # Hide legend for boxplots
  ) +
  labs(
    x = "Protein Source", 
    y = "Alpha Diversity Index",
    title = "Alpha Diversity Across Protein Sources"
  ) +
  stat_compare_means(size = 4) +  # Add p-values for comparisons
  theme_classic() +               # Use a clean classic theme
  scale_fill_manual(values = c("Plant" = "blue", "Animal" = "red")) +  # Custom fill colors
  theme(
    strip.text = element_text(face = "bold", size = 12),  # Bold facet labels
    axis.title.x = element_text(face = "bold", size = 12),  # Bold x-axis title
    axis.title.y = element_text(face = "bold", size = 12),  # Bold y-axis title
    plot.title = element_text(face = "bold", size = 14, hjust = 0.5)  # Centered bold title
  )

# Display the plot
alpha_diversity_plot

Beta Diversity Index

Step 20: Principal Coordinate Analysis (PCoA) Based on Aitchison Distance

# Step 1: CLR Transformation
Protein_filt_clr <- transform(Protein_filt1, "clr")  # Centered log-ratio transformation

# Step 2: Calculate Aitchison Distance
otu_clr <- otu_table(Protein_filt_clr)  # Extract transformed OTU table
aitchison_dist <- dist(t(otu_clr), method = "euclidean")  # Aitchison distance

# Step 3: PERMANOVA Analysis
# Extract metadata as a data frame
metadata <- as(sample_data(Protein_filt1), "data.frame")

# Run PERMANOVA
permanova_res <- adonis2(aitchison_dist ~ Protein_source, data = metadata, permutations = 999)

# View the PERMANOVA results
print(permanova_res)
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = aitchison_dist ~ Protein_source, data = metadata, permutations = 999)
##           Df SumOfSqs      R2      F Pr(>F)    
## Model      1    47072 0.05496 10.758  0.001 ***
## Residual 185   809474 0.94504                  
## Total    186   856545 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Extract R² and p-value
R2 <- round(permanova_res$R2[1], 3)  # Proportion of variance explained
p_value <- signif(permanova_res$`Pr(>F)`[1], 3)  # p-value

# Step 4: PCoA Ordination
ProteinPCA_aitchison <- cmdscale(aitchison_dist, k = 2, eig = TRUE)  # Perform classical MDS
PCoA_scores <- as.data.frame(ProteinPCA_aitchison$points)  # Extract PCoA scores
colnames(PCoA_scores) <- c("PCoA1", "PCoA2")
PCoA_scores$Protein_source <- metadata$Protein_source  # Add metadata for grouping

# Calculate explained variance for PCoA axes
eigenvalues <- ProteinPCA_aitchison$eig
positive_eigenvalues <- eigenvalues[eigenvalues > 0]
variance_explained <- positive_eigenvalues / sum(positive_eigenvalues) * 100

# Step 5: Plot PCoA with Annotated R² and p-value
Proteingraph_aitchison <- ggplot(PCoA_scores, aes(x = PCoA1, y = PCoA2, color = Protein_source)) +
  geom_point(size = 3) +  # Plot points
  stat_ellipse(aes(fill = Protein_source), alpha = 0.2, geom = "polygon") +  # Add confidence ellipses
  labs(
    title = "PCoA Ordination of Aitchison Distance",
    x = paste0("PCoA1 (", round(variance_explained[1], 2), "%)"),
    y = paste0("PCoA2 (", round(variance_explained[2], 2), "%)"),
    color = "Protein Source",
    fill = "Protein Source"
  ) +
  scale_color_manual(values = c("Plant" = "blue", "Animal" = "red")) +  # Custom colors
  scale_fill_manual(values = c("Plant" = "blue", "Animal" = "red")) +  # Custom colors
  annotate(
    "text",
    x = max(PCoA_scores$PCoA1) * 0.9,
    y = min(PCoA_scores$PCoA2) * 0.9,
    label = paste0("PERMANOVA R² = ", R2, "\n", "p-value = ", p_value),
    size = 4, hjust = 0.4, color = "black"
  ) +
  theme_classic2() +
  theme(
    legend.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold", size = 12),
    axis.text = element_text(size = 10)
  )

# Display the plot
Proteingraph_aitchison

Step 21: ANOSIM Analysis based on Aitchison Distance

# CLR Transformation and Aitchison Distance (from previous steps)
Protein_filt_clr <- transform(Protein_filt1, "clr")  # CLR transformation
aitchison_dist <- dist(t(otu_table(Protein_filt_clr)), method = "euclidean")  # Aitchison distance

# Extract metadata
metadata <- as(sample_data(Protein_filt1), "data.frame")
grouping_var <- metadata$Protein_source  # Plant/Animal grouping

# Step 1: Run ANOSIM
anosim_res <- anosim(aitchison_dist, grouping_var, permutations = 999)
R_value <- round(anosim_res$statistic, 3)
p_value <- signif(anosim_res$signif, 3)

# Step 2: Prepare Dissimilarity Data Frame
dissimilarity_df <- as.data.frame(as.table(as.matrix(aitchison_dist)))
colnames(dissimilarity_df) <- c("Sample1", "Sample2", "Distance")

# Add grouping information
dissimilarity_df$Group1 <- metadata$Protein_source[match(dissimilarity_df$Sample1, rownames(metadata))]
dissimilarity_df$Group2 <- metadata$Protein_source[match(dissimilarity_df$Sample2, rownames(metadata))]

# Define comparison types
dissimilarity_df$Type <- ifelse(
  dissimilarity_df$Group1 == dissimilarity_df$Group2,
  paste0("Within-", dissimilarity_df$Group1),
  "Between-Groups"
)

# Remove duplicates and self-comparisons
dissimilarity_df <- dissimilarity_df[dissimilarity_df$Sample1 != dissimilarity_df$Sample2, ]

# Step 3: Normalize Distances
dissimilarity_df$Normalized_Distance <- scales::rescale(dissimilarity_df$Distance, to = c(0, 1))

# Step 4: Calculate Medians
medians <- aggregate(Normalized_Distance ~ Type, data = dissimilarity_df, FUN = median)

# Step 5: Create ANOSIM Boxplot
anosim_plot <- ggplot(dissimilarity_df, aes(x = Type, y = Normalized_Distance, fill = Type)) +
  geom_boxplot(alpha = 0.7, outlier.shape = NA) +  # Boxplot without outlier dots
  stat_summary(fun = "median", geom = "point", size = 4, color = "black", shape = 18) +  # Median points
  stat_summary(fun = "median", geom = "text", aes(label = round(..y.., 3)), vjust = -1.0, size = 3.5) +  # Median labels
  scale_y_continuous(limits = c(0, 1)) +  # Normalize distance axis
  annotate("text", x = 2, y = 0.95, label = paste0("ANOSIM: R = ", R_value, ", p = ", p_value), size = 4, fontface = "bold") +  # ANOSIM results
  labs(
    x = "Group Comparison",
    y = "Aitchison Dissimilarity Distance"
  ) +
  scale_fill_manual(values = c("Within-Plant" = "blue", "Within-Animal" = "red", "Between-Groups" = "purple")) +  # Custom colors
  theme_classic() +
  theme(
    axis.title.x = element_text(face = "bold", size = 12),
    axis.text.x = element_text(size = 10),
    axis.title.y = element_text(face = "bold", size = 12),
    legend.text = element_text(size = 10),
    legend.title = element_text(face = "bold", size = 12),
    axis.text.y = element_text(size = 10)
  )

# Display the Plot
anosim_plot

Random Forest Algorithm

Step 22: Random Forest Data Splitting and Prediction

set.seed(28)
predictprotein <- t(otu_table(Protein_filt))
  typeres <- sample_data(Protein_filt)$Protein_source
  rfprot <- data.frame(typeres, predictprotein)

  # Split data
trainIndex <- createDataPartition(rfprot$typeres, p = 0.7, list = FALSE)
trainprot <- rfprot[trainIndex, ]   # Subset the data for training
testprot <-  rfprot[-trainIndex, ]   # Subset the data for testing
cat("Training Set Size:", nrow(trainprot), "\n")
## Training Set Size: 132
cat("Test Set Size:", nrow(testprot), "\n")
## Test Set Size: 55
# Fit the Random Forest model using the training set
typeclassval <- rfsrc(typeres ~ ., data = trainprot, ntree = 1000, nodesize = 1, 
                      save.memory = TRUE, forest = TRUE)
typeclassval
##                          Sample size: 132
##            Frequency of class labels: 109, 23
##                      Number of trees: 1000
##            Forest terminal node size: 1
##        Average no. of terminal nodes: 6.661
## No. of variables tried at each split: 101
##               Total no. of variables: 10123
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 83
##                             Analysis: RF-C
##                               Family: class
##                       Splitting rule: gini *random*
##        Number of random split points: 10
##                     Imbalanced ratio: 4.7391
##                    (OOB) Brier score: 0.02295572
##         (OOB) Normalized Brier score: 0.09182289
##                            (OOB) AUC: 0.99680893
##                       (OOB) Log-loss: 0.0752792
##                         (OOB) PR-AUC: 0.98661547
##                         (OOB) G-mean: 0.98147988
##    (OOB) Requested performance error: 0.03030303, 0.03669725, 0
## 
## Confusion matrix:
## 
##           predicted
##   observed Animal Plant class.error
##     Animal    105     4      0.0367
##     Plant       0    23      0.0000
## 
##       (OOB) Misclassification rate: 0.03030303
# Make predictions on the test set
predictions <- predict(typeclassval, newdata = testprot)$class
# Extract actual classes from the test set
true_labels <- testprot$typeres
# Create confusion matrix manually using table()
conf_table <- table(Predicted = predictions, Actual = true_labels)
conf_table
##          Actual
## Predicted Animal Plant
##    Animal     45     1
##    Plant       1     8
# Create confusion matrix
conf_matrix <- table(Predicted = predictions, Actual = true_labels)

# Calculate accuracy
accuracy_random <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Accuracy on test data:", round(accuracy_random * 100, 2), "%\n")
## Accuracy on test data: 96.36 %
# Convert confusion table to a data frame for ggplot visualization
conf_df <- as.data.frame(conf_table)

# Calculate total number of predictions for accuracy calculation
total_predictions <- sum(conf_table)
total_predictions
## [1] 55
# Calculate the class-specific accuracy for each cell
conf_df$Accuracy <- (conf_df$Freq / total_predictions) * 100

# Create a label for each cell that contains both the count and percentage
conf_df$Label <- paste0(conf_df$Freq, " (", round(conf_df$Accuracy, 1), "%)")
# Plot the confusion matrix using ggplot2
confusion_plot <- ggplot(data = conf_df, aes(x = Predicted, y = Actual, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Label), color = "black") +  # Add the text label with count and accuracy
  scale_fill_gradient(low = "white", high = "blue") +
  labs(title = "Test data Confusion Matrix",
       x = "Predicted",
       y = "Actual",
       fill = "Frequency") +
  theme_minimal()  +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(size = 14, face = "bold"),       # Reduce title label size
    axis.title.x = element_text(size = 12, face = "bold"),     # Increase x-axis label size
    axis.title.y = element_text(size = 12, face = "bold")      # Increase y-axis label size
  )
confusion_plot

Step 23: Mean Decrease Accuracy of Top Taxa

# Make Training Dataset
predict <- t(otu_table(Protein_filt))
dim(predict)
## [1]   187 10123
# Create response variable 
# Create response variable for samples with class as "Protein_source"
res <- sample_data(Protein_filt)$Protein_source


# Combine them into 1 data frame
Protein_rf <- data.frame(res, predict)
dim(Protein_rf) #855 samples with 711 OTUs
## [1]   187 10124
set.seed(2000)

proteinclass <- randomForest(res ~ ., data=Protein_rf, ntree = 1000, importance = TRUE)
print(proteinclass)
## 
## Call:
##  randomForest(formula = res ~ ., data = Protein_rf, ntree = 1000,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 1000
## No. of variables tried at each split: 100
## 
##         OOB estimate of  error rate: 2.14%
## Confusion matrix:
##        Animal Plant class.error
## Animal    154     1 0.006451613
## Plant       3    29 0.093750000
# Extract importance scores
impcl <- randomForest::importance(proteinclass)
impcl_df <- data.frame(OTUID = rownames(impcl), impcl)

# Remove 'X' prefix from 'OTUID'y
impcl_df$OTUID <- gsub("X", "", impcl_df$OTUID)

# Prepare the OTU dataframe with taxonomic information
otu_df <- as.data.frame(tax_table(Protein_filt))
otu_df$OTUID <- rownames(otu_df)

# Assuming impcl_df is your data frame with taxa and their MDA values
positive_impact_taxa <- impcl_df %>%
  filter(MeanDecreaseAccuracy > 0) %>%
  arrange(desc(MeanDecreaseAccuracy)) %>%
  top_n(10, MeanDecreaseAccuracy)

# Filter out taxa with negative MDA values
positive_impact_taxa <- impcl_df[impcl_df$MeanDecreaseAccuracy > 0, ]

# Order by MDA in descending order and select the top 10
positive_impact_taxa <- positive_impact_taxa[order(-positive_impact_taxa$MeanDecreaseAccuracy), ]
top_10_positive_taxa <- head(positive_impact_taxa, 10)

# Merge importance scores with taxonomic information
imp_merged <- merge(top_10_positive_taxa, otu_df, by = "OTUID")
imp_merged <- imp_merged %>% tidyr::replace_na(list(Kingdom = "Unassigned",
                                                    Phylum = "Unassigned",
                                                    Class = "Unassigned",
                                                    Order = "Unassigned",
                                                    Family = "Unassigned",
                                                    Genus = "Unassigned",
                                                    Species = "Unassigned"))
#Graph itself for top 10 OTUs
colors <- brewer.pal(12, "Paired")
ggplot(imp_merged, aes(x = reorder(Genus, MeanDecreaseAccuracy, decreasing = TRUE), y = MeanDecreaseAccuracy, fill = Family)) +
  geom_bar(stat = "identity", color = "black") +
  coord_flip() +
  theme_classic() +
  theme(legend.position = "top") +
  scale_color_manual(values = colors) +
  labs(x = "Top Taxa for classifying \n  Plant vs Animal Protein",
       y = "Mean Decrease Accuracy (%)",
       fill = "")

LINEAR DISCRIMINANT ANALYSIS

Step 24: LEfSe analysis

lefse_results_file <- "lefse_results.rds"

if (!file.exists(lefse_results_file)) {
  lefse <- run_lefse(
    Protein_filt,
    group = "Protein_source",   # Metadata column for grouping
    wilcoxon_cutoff = 0.01,
    lda_cutoff = 3,
    multigrp_strat = TRUE,
    kw_cutoff = 0.01
  )
  saveRDS(lefse, lefse_results_file)
  cat("LEfSe analysis completed and results saved.\n")
} else {
  lefse <- readRDS(lefse_results_file)
  cat("LEfSe results loaded.\n")
}
## LEfSe results loaded.
# Prepare marker data for plotting
marker_data <- data.frame(
  feature = marker_table(lefse)$feature,
  enrich_group = as.character(marker_table(lefse)$enrich_group),
  ef_lda = marker_table(lefse)$ef_lda,
  pvalue = marker_table(lefse)$pvalue
)

# Process marker data
marker_data <- marker_data %>%
  mutate(
    genus = sub(".*\\|", "", feature),           # Extract genus name
    LDA_Adjusted = ifelse(enrich_group == "Plant", ef_lda, -ef_lda) # Adjust LDA scores
  ) %>%
  filter(LDA_Adjusted != 0) %>%                  # Exclude taxa with zero LDA score
  arrange(enrich_group, LDA_Adjusted)           # Sort by group and LDA score

# Create the bar plot
lefse_bar_plot <- ggplot(marker_data, aes(x = reorder(genus, LDA_Adjusted), y = LDA_Adjusted, fill = enrich_group)) +
  geom_bar(stat = "identity", width = 0.8) +
  geom_text(
    data = marker_data %>% filter(enrich_group == "Animal"),
    aes(label = genus, y = 0.1),
    hjust = 0, size = 3, color = "black"
  ) +
  geom_text(
    data = marker_data %>% filter(enrich_group == "Plant"),
    aes(label = genus, y = -0.1),
    hjust = 1, size = 3, color = "black"
  ) +
  coord_flip() +
  theme_classic() +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    plot.title = element_text(face = "bold", hjust = 0.5)
  ) +
  labs(
    y = "LDA score (log10)",
    x = NULL
  ) +
  scale_y_continuous(
    limits = c(-5, 6),
    breaks = seq(-5, 6, by = 1)
  ) +
  scale_fill_manual(values = c("Plant" = "blue", "Animal" = "red"), name = "Group")

# Print the plot
print(lefse_bar_plot)

Step 25: Plot Cladogram

plot_cladogram(lefse, color = c("blue", "red"), clade_label_level = 4)

MICOM Analysis

Step 26: Import Metabolic Flux files

Metabolic_flux <- import_qiime_sample_data("META_flux1.tsv")
dim(Metabolic_flux)
## [1]  86 188
Metadf <- data.frame(Metabolic_flux)

Metadf <- Metadf[,-1]
dim(Metadf)
## [1]  86 187

Step 27: Random Forest Protein source classification based on Metabolic Flux

#Make Training Dataset
predict <- t(Metadf)
dim(predict)
## [1] 187  86
# We have 116 samples representing 353 pathways
dim(sample_data(Protein_filt))
## [1] 187  26
res <- as.factor(sample_data(Protein_filt)$Protein_source)
res <- sample_data(Metadf)$Protein_source
dim(res)
## NULL
head(sample_data(Protein_filt)$Protein_source)
## [1] Plant Plant Plant Plant Plant Plant
## Levels: Animal Plant
res <- sample_data(Protein_filt)$Protein_source
res <- as.factor(res)  # Convert to factor
# Create Response Variable and Dataset
rf <- data.frame(res, predict)
# Random Forest Model (OOB)
class <- rfsrc(res ~ ., data = rf, ntree = 1000, 
               nodesize = 1, save.memory = TRUE, seed = 8)

imp <- vimp(class, importance = "permute")$importance

# Turn into dataframe and rename the one column into Var Imp
impdf <- data.frame(imp)
colnames(impdf) <- c("all", "Animal", "Plant")

# Add the pathways into the data as its own column
impdf2 <- impdf %>% add_column(Metabolite = rownames(data.frame(Metadf)))

# Arrange the variable importance values
arrangedimp <- arrange(impdf2, desc(all))

# Take only the non-zero VarImp
impfeat <- arrangedimp[arrangedimp$all != 0, ]
dim(impfeat)
## [1] 66  4
# Filter top 20
top10impfeat <- arrange(impfeat, desc(all))[1:20, ]

# Graph top 10 metabolites
colors <- brewer.pal(10, "Paired")

ggplot(top10impfeat, aes(x = reorder(Metabolite, all, decreasing = TRUE), y = all, fill = Metabolite)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_classic() +
  theme(legend.position = "top") +
  scale_color_manual(values = colors) +
  labs(x = "Top Metabolites for Classifying\n Diet Type",
       y = "OOB Variable Importance",
       fill = "")

# Validation Model
ind <- sample(2, nrow(rf), replace = TRUE, prob = c(0.7, 0.3))
train <- rf[ind == 1, ]
test <- rf[ind == 2, ]

# Generate Initial Validation Classification Model
set.seed(28)
typeclassval <- rfsrc(res ~ ., data = train, ntree = 1000, nodesize = 1,
                      save.memory = TRUE, seed = 28, forest = TRUE)

# Perform Validation
rfttpredprot <- predict.rfsrc(typeclassval, test, seed = 28)

# Variable Importance for Validation
impvalp2 <- vimp(typeclassval, importance = "permute")$importance
impdfvalp2 <- data.frame(impvalp2)
colnames(impdfvalp2)[which(names(impdfvalp2) == "all")] <- "VarImp"

impdf2valp2 <- impdfvalp2 %>% add_column(Metabolite = rownames(data.frame(Metadf)))

# Arrange the variable importance values
arrangedimpvalp2 <- dplyr::arrange(impdf2valp2, desc(VarImp))
impfeatvalp2 <- arrangedimpvalp2[arrangedimpvalp2$VarImp != 0, ]
dim(impfeatvalp2)
## [1] 65  4
# Graph the top 10 pathways
top10impfeatvalp2 <- impfeatvalp2[1:20, ]

colors <- brewer.pal(10, "Paired")

ggplot(top10impfeatvalp2, aes(x = reorder(Metabolite, VarImp, increasing = TRUE), y = VarImp, fill = Metabolite)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  theme_classic() +
  theme(legend.position = "right") +
  scale_color_manual(values = colors) +
  labs(x = "Metabolic flux for classifying Protein Source",
       y = "Validation Variable Importance",
       fill = "")

Step 28: Spearman Correlation of RF Metabolic Flux and Microbial Abundance

flux_metadata <- read.csv("META_flux.csv")
flux_metadata <- flux_metadata[!duplicated(flux_metadata[, 1]), ]
rownames(flux_metadata) <- flux_metadata[, 1]
flux_metadata <- flux_metadata[, -1]  # Drop column used for row names

# Load taxa abundance
taxa_abundance <- read.csv("microbial_abundance.csv", row.names = 1)

# Load metadata for Protein source
sample_metadata <- data.frame(
  Sample = colnames(flux_metadata),
  Protein_source = c("Plant","Animal") 
)

### Compute Correlation and P-values
cor.mtest <- function(mat1, mat2, method = "spearman") {
  p_mat <- matrix(NA, nrow = ncol(mat1), ncol = ncol(mat2))
  for (i in 1:ncol(mat1)) {
    for (j in 1:ncol(mat2)) {
      test <- cor.test(mat1[, i], mat2[, j], method = method)
      p_mat[i, j] <- test$p.value
    }
  }
  return(list(p = p_mat))
}

cor_matrix <- cor(t(taxa_abundance), flux_metadata, method = "spearman")
p_values <- cor.mtest(t(taxa_abundance), flux_metadata, method = "spearman")$p

### Filter Significant Taxa and Add Markers
significant_taxa <- apply(p_values, 1, function(row) any(row < 0.05))
cor_matrix_significant <- cor_matrix[significant_taxa, ]
p_values_significant <- p_values[significant_taxa, ]
significant_markers <- ifelse(p_values_significant < 0.05, "*", "")

### Top 50 Taxa Analysis
max_cor <- apply(abs(cor_matrix_significant), 1, max)
ranked_taxa <- names(sort(max_cor, decreasing = TRUE))
top_50_taxa <- ranked_taxa[1:50]

cor_matrix_top50 <- cor_matrix_significant[top_50_taxa, , drop = FALSE]
p_values_top50 <- p_values_significant[top_50_taxa, , drop = FALSE]
significant_markers_top50 <- ifelse(p_values_top50 < 0.05, "*", "")
# Calculate Total Metabolic Flux for Each Metabolite
total_flux <- rowSums(flux_metadata)

# Create DataFrame of Metabolites and Their Total Flux
metabolite_flux <- data.frame(
  Metabolite = rownames(flux_metadata),
  Total_Flux = total_flux
)

# Match metabolites to their Protein source based on sample association
metabolite_flux <- metabolite_flux %>%
  mutate(Protein_source = apply(
    flux_metadata[Metabolite, ], 1,
    function(row) {
      # Determine the dominant Protein source by summing fluxes per group
      source_sums <- tapply(row, sample_metadata$Protein_source, sum)
      names(which.max(source_sums))  # Assign the Protein source with the highest total flux
    }
  ))

# Filter the correlation matrix and annotations for the matched metabolites
cor_matrix_top50 <- cor_matrix_top50[metabolite_flux$Metabolite, , drop = FALSE]
annotation_col <- data.frame(
  Protein_source = metabolite_flux$Protein_source
)
rownames(annotation_col) <- metabolite_flux$Metabolite

# Define Annotation Colors
annotation_colors <- list(
  Protein_source = c("Plant" = "green", "Animal" = "orange")
)

# Order Columns by Protein Source
ordered_cols <- order(annotation_col$Protein_source)
cor_matrix_top50 <- cor_matrix_top50[, ordered_cols, drop = FALSE]
annotation_col <- annotation_col[ordered_cols, , drop = FALSE]

### Assign Genus and Phylum Annotations Dynamically
taxonomy <- as.data.frame(tax_table(Protein_filt))  # Replace with your phyloseq object
taxonomy <- taxonomy %>%
  tibble::rownames_to_column(var = "OTU") %>%
  tidyr::replace_na(list(Genus = "Unknown", Phylum = "Unknown"))

otu_to_genus <- taxonomy$Genus
names(otu_to_genus) <- taxonomy$OTU
otu_to_phylum <- taxonomy$Phylum
names(otu_to_phylum) <- taxonomy$OTU

rownames(cor_matrix_top50) <- otu_to_genus[rownames(cor_matrix_top50)]
annotation_row <- data.frame(Phylum = otu_to_phylum[rownames(cor_matrix_top50)])
rownames(annotation_row) <- rownames(cor_matrix_top50)

# Define Phylum Colors
phylum_colors <- setNames(
  rainbow(length(unique(annotation_row$Phylum))),
  unique(annotation_row$Phylum)
)

annotation_colors <- c(
  annotation_colors,
  list(Phylum = phylum_colors)
)
### Generate Heatmap
pheatmap(
  cor_matrix_top50,
  color = colorRampPalette(c("blue", "white", "red"))(100),
  display_numbers = significant_markers_top50,
  cluster_rows = TRUE,
  cluster_cols = FALSE,
  main = "Heatmap: Top 50 Taxa vs Metabolite Flux (Spearman Correlation)",
  annotation_col = annotation_col,
  annotation_row = annotation_row,
  annotation_colors = annotation_colors
)

Step 29: Bar plots of selected metabolic Flux based on cardiometabolic profile

# Load datasets
flux <- read.csv("META_exchange_fluxes_EXPORT.csv", header = TRUE)
df.flux <- read.csv("FINAL_Metadata.csv", header = TRUE)
df.reduced <- read.csv("META_reduced.csv", header = TRUE)

# Merge datasets
flux_df_merged <- merge(flux, df.flux, by = "SampleID")
flux.merged <- merge(flux_df_merged, df.reduced, by = "SampleID")

# Clean taxon names by removing "g__" prefix
flux.merged$taxon <- sub("^g__", "", flux.merged$taxon)

# Filter for "isobut_m", "isocapr_m", "isoval_m", "4hphac_m", "2hyoxplac_m" metabolites
Metabolite <- subset(flux.merged, metabolite %in% c("isobut_m", "isocapr_m", "isoval_m", "4hphac_m", "2hyoxplac_m"))

# Calculate maximum y-values for each metabolite
label_positions <- Metabolite %>%
  group_by(metabolite) %>%
  summarize(label_y = max(flux, na.rm = TRUE) * 1.05, .groups = "drop")

# Merge label positions into the dataset
Metabolite <- Metabolite %>%
  left_join(label_positions, by = "metabolite")

# Plot metabolic flux for selected metabolites
ggplot(Metabolite, aes(x = Protein_source, y = flux, fill = Protein_source)) +
  geom_bar_pattern(
    stat = "summary", 
    fun = "sum", 
    position = "dodge",
    pattern = "crosshatch",  # Add crosshatch pattern for checked look
    pattern_density = 0.3,   # Adjust density of the pattern
    pattern_color = "black"   # Outline color of the pattern
  ) +
  stat_compare_means(
    method = "kruskal.test", 
    label = "p.format",       # Display the actual p-value
    size = 3,                 # Adjust text size
    fontface = "bold",        # Make the p-value bold
    label.y = max(Metabolite$flux) *55.05  # Position the p-value slightly above the highest flux value
  ) +
  facet_wrap(~ metabolite, scales = "free_y", labeller = labeller(.default = label_value)) +  # Remove "metabolite:" prefix
  theme_classic2() +
  labs(
    x = "Protein Source", 
    y = "Total Metabolic Flux [mmol/h]"
  ) +
  theme(
    axis.text.x = element_text(face = "bold"),
    axis.text.y = element_text(face = "bold"),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    legend.position = "",  # Show legend
    strip.text = element_text(face = "bold")  # Make facet labels bold
  ) +
  scale_fill_manual(
    values = c("Plant" = "blue", "Animal" = "red"),  # Fill color mapping
    guide = guide_legend(override.aes = list(pattern = "crosshatch"))  # Legend pattern
  )

# Filter for "tma_m", "so3_m", "h2s_m" metabolites
Metabolite <- subset(flux.merged, metabolite %in% c("tma_m", "so3_m", "h2s_m"))

# Calculate maximum y-values for each metabolite
label_positions <- Metabolite %>%
  group_by(metabolite) %>%
  summarize(label_y = max(flux, na.rm = TRUE) * 1.05, .groups = "drop")

# Merge label positions into the dataset
Metabolite <- Metabolite %>%
  left_join(label_positions, by = "metabolite")
# Plot metabolic flux for selected metabolites
ggplot(Metabolite, aes(x = Protein_source, y = flux, fill = Protein_source)) +
  geom_bar_pattern(
    stat = "summary", 
    fun = "sum", 
    position = "dodge",
    pattern = "crosshatch",  # Add crosshatch pattern for checked look
    pattern_density = 0.3,   # Adjust density of the pattern
    pattern_color = "black"   # Outline color of the pattern
  ) +
  stat_compare_means(
    method = "kruskal.test", 
    label = "p.format",       # Display the actual p-value
    size = 3,                 # Adjust text size
    fontface = "bold",        # Make the p-value bold
    label.y = max(Metabolite$flux) *120.05  # Position the p-value slightly above the highest flux value
  ) +
  facet_wrap(~ metabolite, scales = "free_y", labeller = labeller(.default = label_value)) +  # Remove "metabolite:" prefix
  theme_classic2() +
  labs(
    x = "Protein Source", 
    y = "Total Metabolic Flux [mmol/h]"
  ) +
  theme(
    axis.text.x = element_text(face = "bold"),
    axis.text.y = element_text(face = "bold"),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    legend.position = "",  # Show legend
    strip.text = element_text(face = "bold")  # Make facet labels bold
  ) +
  scale_fill_manual(
    values = c("Plant" = "blue", "Animal" = "red"),  # Fill color mapping
    guide = guide_legend(override.aes = list(pattern = "crosshatch"))  # Legend pattern
  )

# Filter for "hdca_m", "ocdca_m" metabolites
Metabolite <- subset(flux.merged, metabolite %in% c("hdca_m", "ocdca_m"))

# Calculate maximum y-values for each metabolite
label_positions <- Metabolite %>%
  group_by(metabolite) %>%
  summarize(label_y = max(flux, na.rm = TRUE) * 1.05, .groups = "drop")

# Merge label positions into the dataset
Metabolite <- Metabolite %>%
  left_join(label_positions, by = "metabolite")
# Plot metabolic flux for selected metabolites
ggplot(Metabolite, aes(x = Protein_source, y = flux, fill = Protein_source)) +
  geom_bar_pattern(
    stat = "summary", 
    fun = "sum", 
    position = "dodge",
    pattern = "crosshatch",  # Add crosshatch pattern for checked look
    pattern_density = 0.3,   # Adjust density of the pattern
    pattern_color = "black"   # Outline color of the pattern
  ) +
  stat_compare_means(
    method = "kruskal.test", 
    label = "p.format",       # Display the actual p-value
    size = 3,                 # Adjust text size
    fontface = "bold",        # Make the p-value bold
    label.y = max(Metabolite$flux) *40  # Position the p-value slightly above the highest flux value
  ) +
  facet_wrap(~ metabolite, scales = "free_y", labeller = labeller(.default = label_value)) +  # Remove "metabolite:" prefix
  theme_classic2() +
  labs(
    x = "Protein Source", 
    y = "Total Metabolic Flux [mmol/h]"
  ) +
  theme(
    axis.text.x = element_text(face = "bold"),
    axis.text.y = element_text(face = "bold"),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    legend.position = "",  # Show legend
    strip.text = element_text(face = "bold")  # Make facet labels bold
  ) +
  scale_fill_manual(
    values = c("Plant" = "blue", "Animal" = "red"),  # Fill color mapping
    guide = guide_legend(override.aes = list(pattern = "crosshatch"))  # Legend pattern
  )

Step 30: Spearman Correlation of Metabolites and Microbial Abundance

flux_metadata <- read.csv("BCFA_ABUNDANCE1.csv")
flux_metadata <- flux_metadata[!duplicated(flux_metadata[, 1]), ]
rownames(flux_metadata) <- flux_metadata[, 1]
flux_metadata <- flux_metadata[, -1]  # Drop column used for row names

# Load taxa abundance
taxa_abundance <- read.csv("microbial_abundance.csv", row.names = 1)

# Load metadata for Protein source
sample_metadata <- data.frame(
  Sample = colnames(flux_metadata),
  Protein_source = c("Plant","Animal") 
)

### Compute Correlation and P-values
cor.mtest <- function(mat1, mat2, method = "spearman") {
  p_mat <- matrix(NA, nrow = ncol(mat1), ncol = ncol(mat2))
  for (i in 1:ncol(mat1)) {
    for (j in 1:ncol(mat2)) {
      test <- cor.test(mat1[, i], mat2[, j], method = method)
      p_mat[i, j] <- test$p.value
    }
  }
  return(list(p = p_mat))
}

cor_matrix <- cor(t(taxa_abundance), flux_metadata, method = "spearman")
p_values <- cor.mtest(t(taxa_abundance), flux_metadata, method = "spearman")$p

### Filter Significant Taxa and Add Markers
significant_taxa <- apply(p_values, 1, function(row) any(row < 0.05))
cor_matrix_significant <- cor_matrix[significant_taxa, ]
p_values_significant <- p_values[significant_taxa, ]
significant_markers <- ifelse(p_values_significant < 0.05, "*", "")

### Top 50 Taxa Analysis
max_cor <- apply(abs(cor_matrix_significant), 1, max)
ranked_taxa <- names(sort(max_cor, decreasing = TRUE))
top_50_taxa <- ranked_taxa[1:50]

cor_matrix_top50 <- cor_matrix_significant[top_50_taxa, , drop = FALSE]
p_values_top50 <- p_values_significant[top_50_taxa, , drop = FALSE]
significant_markers_top50 <- ifelse(p_values_top50 < 0.05, "*", "")
# Calculate Total Metabolic Flux for Each Metabolite
total_flux <- rowSums(flux_metadata)

# Create DataFrame of Metabolites and Their Total Flux
metabolite_flux <- data.frame(
  Metabolite = rownames(flux_metadata),
  Total_Flux = total_flux
)

# Match metabolites to their Protein source based on sample association
metabolite_flux <- metabolite_flux %>%
  mutate(Protein_source = apply(
    flux_metadata[Metabolite, ], 1,
    function(row) {
      # Determine the dominant Protein source by summing fluxes per group
      source_sums <- tapply(row, sample_metadata$Protein_source, sum)
      names(which.max(source_sums))  # Assign the Protein source with the highest total flux
    }
  ))

# Filter the correlation matrix and annotations for the matched metabolites
cor_matrix_top50 <- cor_matrix_top50[metabolite_flux$Metabolite, , drop = FALSE]
annotation_col <- data.frame(
  Protein_source = metabolite_flux$Protein_source
)
rownames(annotation_col) <- metabolite_flux$Metabolite

# Define Annotation Colors
annotation_colors <- list(
  Protein_source = c("Plant" = "green", "Animal" = "orange")
)

# Order Columns by Protein Source
ordered_cols <- order(annotation_col$Protein_source)
cor_matrix_top50 <- cor_matrix_top50[, ordered_cols, drop = FALSE]
annotation_col <- annotation_col[ordered_cols, , drop = FALSE]

### Assign Genus and Phylum Annotations Dynamically
taxonomy <- as.data.frame(tax_table(Protein_filt))  # Replace with your phyloseq object
taxonomy <- taxonomy %>%
  tibble::rownames_to_column(var = "OTU") %>%
  tidyr::replace_na(list(Genus = "Unknown", Phylum = "Unknown"))

otu_to_genus <- taxonomy$Genus
names(otu_to_genus) <- taxonomy$OTU
otu_to_phylum <- taxonomy$Phylum
names(otu_to_phylum) <- taxonomy$OTU

rownames(cor_matrix_top50) <- otu_to_genus[rownames(cor_matrix_top50)]
annotation_row <- data.frame(Phylum = otu_to_phylum[rownames(cor_matrix_top50)])
rownames(annotation_row) <- rownames(cor_matrix_top50)

# Define Phylum Colors
phylum_colors <- setNames(
  rainbow(length(unique(annotation_row$Phylum))),
  unique(annotation_row$Phylum)
)

annotation_colors <- c(
  annotation_colors,
  list(Phylum = phylum_colors)
)
### Generate Heatmap
pheatmap(
  cor_matrix_top50,
  color = colorRampPalette(c("blue", "white", "red"))(100),
  display_numbers = significant_markers_top50,
  cluster_rows = TRUE,
  cluster_cols = FALSE,
  main = "Heatmap: Top 50 Taxa vs Metabolite Flux (Spearman Correlation)",
  annotation_col = annotation_col,
  annotation_row = annotation_row,
  annotation_colors = annotation_colors
)

Supplementary Analysis

Step 31: Venn Diagram of Core Microbiota at Phylum level

Protein_filt@tax_table %>% 
  data.frame() %>% 
  group_by(Phylum) %>% 
  summarise(n = length(Genus)) 
## # A tibble: 36 × 2
##    Phylum               n
##    <chr>            <int>
##  1 Acidobacteriota    183
##  2 Actinobacteriota   757
##  3 Armatimonadota      14
##  4 Bacteroidota      2982
##  5 Bdellovibrionota    21
##  6 Caldisericota        3
##  7 Calditrichota        2
##  8 Campylobacterota    44
##  9 Chloroflexi        236
## 10 Cloacimonadota       2
## # ℹ 26 more rows
bac_Protein_filt <- subset_taxa(Protein_filt, Kingdom == "d__Bacteria")

bac_Protein_df <- psmelt(bac_Protein_filt)

#Summarize taxa#
number_of_tax <- bac_Protein_df %>% 
  dplyr::filter(Abundance > 5) %>% 
  group_by(Protein_source == "Plant", Phylum) 

number_of_tax2 <- bac_Protein_df %>% 
  dplyr::filter(Abundance > 5) %>% 
  group_by(Protein_source == "Animal", Phylum) 

#To place the sets of genera in a list, we use 
venn_data <- list(Plant = c(number_of_tax$Phylum[number_of_tax$`Protein_source == "Plant"`]),
                  Animal = c(number_of_tax2$Phylum[number_of_tax2$`Protein_source == "Animal"`]))

# Plot the Venn diagram and specify colors for each set
ggvenn(
  venn_data,
  fill_color = c("blue", "red"),
  stroke_size = 0.5, set_name_size = 4
)

Step 32: Core Microbiota Composition at Phylum level

# Step 1: Filter and clean the phyloseq object
Protein_filt <- Protein_filt %>%
  tax_fix(unknowns = c("uncultured", "unclassified", "unknown", "NA")) %>%  # Replace unknown values
  tax_filter(min_prevalence = 1) %>%  # Retain taxa with at least 1 prevalence
  phyloseq_validate()  # Validate the phyloseq object
print(Protein_filt)
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 10123 taxa and 187 samples ]
## sample_data() Sample Data:       [ 187 samples by 26 sample variables ]
## tax_table()   Taxonomy Table:    [ 10123 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 10123 tips and 10061 internal nodes ]
# Step 2: Check and handle NAs in the Genus rank
if (any(is.na(tax_table(Protein_filt)[, "Genus"]))) {
  cat("NAs found in Genus rank. Replacing with 'Unknown'.\n")
  tax_table(Protein_filt)[, "Genus"] <- ifelse(
    is.na(tax_table(Protein_filt)[, "Genus"]),
    "Unknown",
    tax_table(Protein_filt)[, "Genus"]
  )
}

# Step 3: Create the bar plot
p1 <- Protein_filt %>%
  ps_select(Protein_source, Protein_level) %>%  # Select relevant metadata columns
  phyloseq::merge_samples(group = "Protein_source") %>%  # Merge samples by "Protein_source"
  comp_barplot(
    tax_level = "Phylum",  # Aggregation level for plotting
    n_taxa = 10,  # Number of taxa to display
    bar_width = 0.8,  # Adjust bar width
    position = position_dodge(width = 0.6),  # Adjust spacing between bars
    transform = function(x) x * 100  # Convert relative abundance to percentages
  ) +
  labs(
    x = "Protein Source",  # X-axis label
    y = "Relative Abundance (%)"  # Y-axis label indicating percentage
  ) +
  theme_classic() +  # Apply a clean theme
  theme(
    axis.text.x = element_text(face = "bold", size = 12),  # Customize x-axis text
    axis.text.y = element_text(face = "bold", size = 12),  # Customize y-axis text
    axis.title.x = element_text(face = "bold", size = 14),  # Bold x-axis title
    axis.title.y = element_text(face = "bold", size = 14),  # Bold y-axis title
    legend.text = element_text(size = 10),  # Adjust legend text size
    legend.title = element_text(face = "bold")  # Bold legend title
  )

# Step 4: Ensure proper ordering of Protein_source levels
if ("Protein_source" %in% colnames(p1$data)) {
  p1$data$Protein_source <- factor(p1$data$Protein_source, levels = c("Animal", "Plant"))
} else {
  stop("Error: 'Protein_source' column is missing in the plot data.")
}

# Step 5: Display the plot
print(p1)

Step 33: Random Forest Randomization for model Accuracy Validation

# Split data into training and test sets
set.seed(28)
trainIndex <- createDataPartition(typeres, p = 0.7, list = FALSE)
train_set <- rfprot[trainIndex, ]
test_set <- rfprot[-trainIndex, ]
# Number of samples in your test set
num_test_samples <- nrow(test_set)

# Generate random labels (1 or 2) for the test data
set.seed(42)  # Set seed for reproducibility
random_labels <- sample(c(1, 2), num_test_samples, replace = TRUE)

# Add these random labels to the test set
test_set$random_labels <- random_labels
# Make predictions on the test set
random_predictions <- predict(typeclassval, newdata = test_set)$class

# Create confusion matrix
conf_matrix <- table(Predicted = random_predictions, Actual = test_set$random_labels)

# Convert confusion table to a data frame for further analysis or visualization
conf_df <- as.data.frame(conf_matrix)

# Calculate accuracy
total_predictions <- sum(conf_matrix)
conf_df$Accuracy <- (conf_df$Freq / total_predictions) * 100

# Create a label for each cell with count and accuracy percentage
conf_df$Label <- paste0(conf_df$Freq, " (", round(conf_df$Accuracy, 1), "%)")

# View the confusion matrix with labels
print(conf_df)

# Calculate accuracy
accuracy_random <- sum(diag(conf_matrix)) / sum(conf_matrix)
cat("Accuracy on random test data:", round(accuracy_random * 100, 2), "%\n")


# Convert confusion table to a data frame for visualization
conf_df <- as.data.frame(conf_matrix)

# Create a label for each cell with count and accuracy percentage
conf_df$Accuracy <- (conf_df$Freq / sum(conf_matrix)) * 100
conf_df$Label <- paste0(conf_df$Freq, " (", round(conf_df$Accuracy, 1), "%)")

Plot confusion matrix

conf_matrix_plot <- ggplot(conf_df, aes(x = Predicted, y = Actual, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Label), color = "black", size = 7) + 
  scale_fill_gradient(low = "white", high = "blue") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Randomized data Confusion Matrix", x = "Predicted", y = "Actual") +
  theme(legend.position = "none")
conf_matrix_plot

Step 34: Log10 Abundance of RF Top 10 Taxa

top_genera <- unique(imp_merged$Genus) # Assuming the top 10 are already selected

Protein_genera <- Protein_filt %>%
  tax_glom(taxrank = "Genus") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt() %>%                                         # Melt to long format
  dplyr::arrange(Phylum)                                        # Sort data frame alphabetically by phylum

# Create a filtered version of Protein_genera that reflects what you aggregated. 
Protein_genera_filtered <- Protein_genera %>%
  filter(Genus %in% top_genera)

# Perform the Wilcoxon rank-sum test for each genus
Wil_Plant_vs_Animal <- lapply(top_genera, function(genus) {
  Plant_abundance <- Protein_genera_filtered$Abundance[Protein_genera_filtered$Genus == genus & Protein_genera_filtered$Protein_source == "Plant"]
  Animal_abundance <- Protein_genera_filtered$Abundance[Protein_genera_filtered$Genus == genus & Protein_genera_filtered$Protein_source == "Animal"]})

# Create the box plot
boxplot_data <- Protein_genera_filtered %>%
  filter(Protein_source %in% c("Plant", "Animal")) %>%
  ggplot(aes(x = Protein_source, y = Abundance, fill = Protein_source)) +
  geom_boxplot() + 
  scale_y_log10() + 
  facet_wrap(~Genus, scales = "free_y", nrow = 2) + 
  stat_compare_means(comparisons = list(c("Plant", "Animal")), label = 'p.signif') + 
  labs(x = "", y = "Log10 Abundance", fill = "Protein Source") + 
  theme_classic2() + 
  scale_fill_manual(values = c("Plant" = "blue", "Animal" = "red")) +  # Set custom colors
  theme(
    strip.text = element_text(face = "bold", size = 10),  # Make facet labels bold
    axis.text.x = element_blank(),  # Bold x-axis label
    axis.ticks.x = element_blank(),
    axis.title.y = element_text(face = "bold", size = 14)   # Bold y-axis label
  )

# Print the box plot
print(boxplot_data)